wait
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
StateOfInterest = c("Arizona", "California", "Minnesota", "New Mexico", "New York",
"Oklahoma", "South Carolina", "Tennessee", "Utah", "Virginia",
"West Virginia", "Wisconsin")
library(readr)
county_data_abridged = read_csv("county_data_abridged.csv")
dim(county_data_abridged)
## [1] 3244 87
names(county_data_abridged)
## [1] "countyFIPS" "STATEFP"
## [3] "COUNTYFP" "CountyName"
## [5] "StateName" "State"
## [7] "lat" "lon"
## [9] "POP_LATITUDE" "POP_LONGITUDE"
## [11] "CensusRegionName" "CensusDivisionName"
## [13] "Rural-UrbanContinuumCode2013" "PopulationEstimate2018"
## [15] "PopTotalMale2017" "PopTotalFemale2017"
## [17] "FracMale2017" "PopulationEstimate65+2017"
## [19] "PopulationDensityperSqMile2010" "CensusPopulation2010"
## [21] "MedianAge2010" "#EligibleforMedicare2018"
## [23] "MedicareEnrollment,AgedTot2017" "3-YrDiabetes2015-17"
## [25] "DiabetesPercentage" "HeartDiseaseMortality"
## [27] "StrokeMortality" "Smokers_Percentage"
## [29] "RespMortalityRate2014" "#FTEHospitalTotal2017"
## [31] "TotalM.D.'s,TotNon-FedandFed2017" "#HospParticipatinginNetwork2017"
## [33] "#Hospitals" "#ICU_beds"
## [35] "dem_to_rep_ratio" "PopMale<52010"
## [37] "PopFmle<52010" "PopMale5-92010"
## [39] "PopFmle5-92010" "PopMale10-142010"
## [41] "PopFmle10-142010" "PopMale15-192010"
## [43] "PopFmle15-192010" "PopMale20-242010"
## [45] "PopFmle20-242010" "PopMale25-292010"
## [47] "PopFmle25-292010" "PopMale30-342010"
## [49] "PopFmle30-342010" "PopMale35-442010"
## [51] "PopFmle35-442010" "PopMale45-542010"
## [53] "PopFmle45-542010" "PopMale55-592010"
## [55] "PopFmle55-592010" "PopMale60-642010"
## [57] "PopFmle60-642010" "PopMale65-742010"
## [59] "PopFmle65-742010" "PopMale75-842010"
## [61] "PopFmle75-842010" "PopMale>842010"
## [63] "PopFmle>842010" "3-YrMortalityAge<1Year2015-17"
## [65] "3-YrMortalityAge1-4Years2015-17" "3-YrMortalityAge5-14Years2015-17"
## [67] "3-YrMortalityAge15-24Years2015-17" "3-YrMortalityAge25-34Years2015-17"
## [69] "3-YrMortalityAge35-44Years2015-17" "3-YrMortalityAge45-54Years2015-17"
## [71] "3-YrMortalityAge55-64Years2015-17" "3-YrMortalityAge65-74Years2015-17"
## [73] "3-YrMortalityAge75-84Years2015-17" "3-YrMortalityAge85+Years2015-17"
## [75] "mortality2015-17Estimated" "stay at home"
## [77] ">50 gatherings" ">500 gatherings"
## [79] "public schools" "restaurant dine-in"
## [81] "entertainment/gym" "federal guidelines"
## [83] "foreign travel ban" "SVIPercentile"
## [85] "HPSAShortage" "HPSAServedPop"
## [87] "HPSAUnderservedPop"
colnames(county_data_abridged)[colnames(county_data_abridged)
== "#EligibleforMedicare2018"] = "EligibleforMedicare2018"
colnames(county_data_abridged)[colnames(county_data_abridged)
== "#FTEHospitalTotal2017"] = "FTEHospitalTotal2017"
colnames(county_data_abridged)[colnames(county_data_abridged)
== "#HospParticipatinginNetwork2017"] = "HospParticipatinginNetwork2017"
colnames(county_data_abridged)[colnames(county_data_abridged)
== "#Hospitals"] = "Hospitals"
colnames(county_data_abridged)[colnames(county_data_abridged)
== "#ICU_beds"] = "ICU_beds"
colnames(county_data_abridged)[colnames(county_data_abridged)
== "PopulationEstimate65+2017"] = "PopulationEstimate_above65_2017"
colnames(county_data_abridged)[colnames(county_data_abridged)
== "stay at home"] = "stay_at_home"
colnames(county_data_abridged)[colnames(county_data_abridged)
== ">50 gatherings"] = "above_50_gatherings"
colnames(county_data_abridged)[colnames(county_data_abridged)
== ">500 gatherings"] = "above_500_gatherings"
colnames(county_data_abridged)[colnames(county_data_abridged)
== "public schools"] = "public_schools"
colnames(county_data_abridged)[colnames(county_data_abridged)
== "restaurant dine-in"] = "restaurant_dine_in"
colnames(county_data_abridged)[colnames(county_data_abridged)
== "entertainment/gym"] = "entertainment_gym"
colnames(county_data_abridged)[colnames(county_data_abridged)
== "federal guidelines"] = "federal_guidelines"
colnames(county_data_abridged)[colnames(county_data_abridged)
== "foreign travel ban"] = "foreign_travel_ban"
data = subset(county_data_abridged,
select = c(State, CountyName, POP_LATITUDE, POP_LONGITUDE,
PopulationEstimate2018, PopTotalMale2017,
PopulationEstimate_above65_2017, PopulationDensityperSqMile2010,
DiabetesPercentage, Smokers_Percentage,
HeartDiseaseMortality, StrokeMortality,
Hospitals, ICU_beds, HospParticipatinginNetwork2017,
stay_at_home, above_50_gatherings, above_500_gatherings,
restaurant_dine_in, entertainment_gym))
data = na.omit(data)
data = droplevels(data)
data$stay_at_home = data$stay_at_home - range(data$stay_at_home)[1]
data$above_50_gatherings = data$above_50_gatherings - range(data$above_50_gatherings)[1]
data$above_500_gatherings = data$above_500_gatherings - range(data$above_500_gatherings)[1]
data$restaurant_dine_in = data$restaurant_dine_in - range(data$restaurant_dine_in)[1]
data$entertainment_gym = data$entertainment_gym - range(data$entertainment_gym)[1]
str(data)
## tibble [2,585 × 20] (S3: tbl_df/tbl/data.frame)
## $ State : chr [1:2585] "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ CountyName : chr [1:2585] "Autauga" "Baldwin" "Barbour" "Bibb" ...
## $ POP_LATITUDE : num [1:2585] 32.5 30.5 31.8 33 34 ...
## $ POP_LONGITUDE : num [1:2585] -86.5 -87.8 -85.3 -87.1 -86.6 ...
## $ PopulationEstimate2018 : num [1:2585] 55601 218022 24881 22400 57840 ...
## $ PopTotalMale2017 : num [1:2585] 27007 103225 13335 12138 28607 ...
## $ PopulationEstimate_above65_2017: num [1:2585] 8392 42413 4757 3632 10351 ...
## $ PopulationDensityperSqMile2010 : num [1:2585] 91.8 114.7 31 36.8 88.9 ...
## $ DiabetesPercentage : num [1:2585] 9.9 8.5 15.7 13.3 14.9 22.4 16.9 15.6 17.5 12.2 ...
## $ Smokers_Percentage : num [1:2585] 18.1 17.5 22 19.1 19.2 ...
## $ HeartDiseaseMortality : num [1:2585] 204 183 220 226 225 ...
## $ StrokeMortality : num [1:2585] 56.1 41.9 49 57.2 52.8 54.1 59 44 45.2 47.3 ...
## $ Hospitals : num [1:2585] 1 3 1 1 1 1 1 2 0 1 ...
## $ ICU_beds : num [1:2585] 6 51 5 0 6 0 7 24 0 0 ...
## $ HospParticipatinginNetwork2017 : num [1:2585] 0 0 0 0 1 0 0 0 0 0 ...
## $ stay_at_home : num [1:2585] 16 16 16 16 16 16 16 16 16 16 ...
## $ above_50_gatherings : num [1:2585] 5 5 5 5 5 5 5 5 5 5 ...
## $ above_500_gatherings : num [1:2585] 2 2 2 2 2 2 2 2 2 2 ...
## $ restaurant_dine_in : num [1:2585] 7 7 7 7 7 7 7 7 7 7 ...
## $ entertainment_gym : num [1:2585] 16 16 16 16 16 16 16 16 16 16 ...
## - attr(*, "na.action")= 'omit' Named int [1:659] 68 69 70 71 72 73 74 75 76 77 ...
## ..- attr(*, "names")= chr [1:659] "68" "69" "70" "71" ...
data_demographic_county = data
timeseries = read_csv("../datasets/timeseries.csv")
data = timeseries
data = subset(data, country == "United States" & level == "state")
data = subset(data, !(name %in% c("Unassigned cases, Arkansas, US",
"Unassigned cases, Georgia, US", "Unassigned cases, Illinois, US",
"Unassigned cases, Iowa, US", "Unassigned cases, Maine, US",
"Unassigned cases, Massachusetts, US", "Unassigned cases, North Dakota, US",
"Washington, D.C., US")))
data$state = matrix(unlist(strsplit(as.character(data$name), ", ")), ncol = 2, byrow = TRUE)[, 1]
data = subset(data,
select = c(state, date, cases, deaths, recovered))
data = subset(data, state %in% StateOfInterest)
data$state = as.factor(data$state)
data$cases = as.numeric(data$cases)
data$deaths = as.numeric(data$deaths)
data$recovered = as.numeric(data$recovered)
data = na.omit(data)
data = droplevels(data)
str(data)
## tibble [1,578 × 5] (S3: tbl_df/tbl/data.frame)
## $ state : Factor w/ 12 levels "Arizona","California",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ date : Date[1:1578], format: "2020-04-14" "2020-04-15" ...
## $ cases : num [1:1578] 3806 3962 4234 4507 4719 ...
## $ deaths : num [1:1578] 131 142 150 169 177 184 187 208 229 249 ...
## $ recovered: num [1:1578] 249 385 460 539 539 ...
## - attr(*, "problems")= tibble [1,666,558 × 5] (S3: tbl_df/tbl/data.frame)
## ..$ row : int [1:1666558] 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 ...
## ..$ col : chr [1:1666558] "state" "state" "state" "state" ...
## ..$ expected: chr [1:1666558] "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" ...
## ..$ actual : chr [1:1666558] "Burgenland" "Burgenland" "Burgenland" "Burgenland" ...
## ..$ file : chr [1:1666558] "'../datasets/timeseries.csv'" "'../datasets/timeseries.csv'" "'../datasets/timeseries.csv'" "'../datasets/timeseries.csv'" ...
## - attr(*, "na.action")= 'omit' Named int [1:894] 1 2 3 4 5 6 7 8 9 10 ...
## ..- attr(*, "names")= chr [1:894] "1" "2" "3" "4" ...
range(data$cases)
## [1] 237 613076
range(data$deaths)
## [1] 1 25250
range(data$recovered)
## [1] 2 266287
data_cases_state = data
model_out = read_csv("../datasets/model_out.csv")
str(model_out)
## tibble [73 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ state : chr [1:73] "Arizona" "Arizona" "Arizona" "Arizona" ...
## $ startdate: Date[1:73], format: "2020-04-16" "2020-05-01" ...
## $ enddate : Date[1:73], format: "2020-04-30" "2020-05-15" ...
## $ k : num [1:73] 0.783 0.863 0.865 0.949 0.986 ...
## $ sigma : num [1:73] 0.0714 0.0714 0.0714 0.0714 0.0714 ...
## $ lamda : num [1:73] 0.0563 0.07 0.1266 0.0784 0.0925 ...
## $ c : num [1:73] 1 0 0.127 1 0 ...
## $ alpha : num [1:73] 0.06825 0.00721 0.06765 0.10224 0.05641 ...
## $ omega : num [1:73] 0.00275 0.003091 0.001429 0.001371 0.000698 ...
## $ miu : num [1:73] 0.0384 0.0175 0.0338 0.036 0.014 ...
## - attr(*, "spec")=
## .. cols(
## .. state = col_character(),
## .. startdate = col_date(format = ""),
## .. enddate = col_date(format = ""),
## .. k = col_double(),
## .. sigma = col_double(),
## .. lamda = col_double(),
## .. c = col_double(),
## .. alpha = col_double(),
## .. omega = col_double(),
## .. miu = col_double()
## .. )
data_parm = model_out
data = data_parm
n = length(data$state)
data$cases = rep(0, n)
data$deaths = rep(0, n)
data$recovered = rep(0, n)
for (i in 1:n){
j = data_cases_state$state == data$state[i] & data_cases_state$date == data$startdate[i]
if (sum(j) == 0){
data[i, ] = NA
}else{
data[i, c("cases", "deaths", "recovered")] = data_cases_state[j, c("cases", "deaths", "recovered")]
}
}
data = na.omit(data)
data_demographic_state = as.data.frame(matrix(nrow = length(data$state), ncol = length(colnames(data_demographic_county)) - 1))
colnames(data_demographic_state) = colnames(data_demographic_county)[colnames(data_demographic_county) != "CountyName"]
data_demographic_state$State = data$state
for (s in data_demographic_state$State){
for (k in 2:ncol(data_demographic_state)){
data_demographic_state[data_demographic_state$State == s, k] =
mean(unlist(data_demographic_county[data_demographic_county$State == s, k+1]))
}
}
data = cbind(data,
subset(data_demographic_state[, colnames(data_demographic_state)[colnames(data_demographic_state) != "State"]]))
data = na.omit(data)
data$days = rep(0, nrow(data))
data$temp = rep(0, nrow(data))
for (i in 1:nrow(data)){
data$temp = diff.Date(c(data[i, ]$startdate, data[i, ]$enddate))
}
for (s in data$state){
for (d in 1:nrow(subset(data, state == s))){
data[data$state == s, "days"][d] = sum(subset(data, state == s)[1:d, "temp"])
}
}
data = data[, colnames(data)[colnames(data) != "temp"]]
data_unamed = data[, colnames(data)[!(colnames(data) %in% c("state", "startdate", "enddate"))]]
# head(data_demographic_county, 10)
# head(data_demographic_state, 10)
# head(data[, c("state", "startdate", "enddate", "days")], 10)
head(data, 10)
## state startdate enddate k sigma lamda c alpha
## 1 Arizona 2020-04-16 2020-04-30 0.7834 0.07143 0.05633 1.00000 0.06825
## 2 Arizona 2020-05-01 2020-05-15 0.8631 0.07143 0.07001 0.00000 0.00721
## 3 Arizona 2020-05-16 2020-05-30 0.8653 0.07143 0.12657 0.12709 0.06765
## 4 Arizona 2020-06-01 2020-06-15 0.9492 0.07143 0.07839 1.00000 0.10224
## 5 Arizona 2020-06-16 2020-06-30 0.9864 0.07143 0.09253 0.00000 0.05641
## 6 Arizona 2020-07-01 2020-07-15 0.9732 0.07143 0.06093 0.00000 0.00000
## 7 California 2020-04-16 2020-04-30 0.8574 0.07143 0.07211 0.07752 0.02728
## 8 California 2020-05-01 2020-05-15 0.9481 0.07143 0.02832 1.00000 0.01925
## 9 California 2020-05-16 2020-05-30 0.9598 0.07143 0.04972 0.40540 0.03497
## 10 California 2020-06-01 2020-06-15 1.1270 0.07143 0.10206 0.01176 0.03073
## omega miu cases deaths recovered POP_LATITUDE POP_LONGITUDE
## 1 0.0027505 0.038429 4234 150 460 33.58 -111.5
## 2 0.0030913 0.017520 7962 330 1528 33.58 -111.5
## 3 0.0014293 0.033751 13631 679 3357 33.58 -111.5
## 4 0.0013712 0.036006 20123 917 4869 33.58 -111.5
## 5 0.0006979 0.013973 39097 1219 6598 33.58 -111.5
## 6 0.0000000 0.003805 84092 1720 9715 33.58 -111.5
## 7 0.0023628 0.007923 28035 970 1753 37.82 -120.9
## 8 0.0013106 0.006909 52152 2131 5130 37.82 -120.9
## 9 0.0009157 0.012128 78704 3207 9098 37.82 -120.9
## 10 0.0006310 0.012709 115032 4219 17585 37.82 -120.9
## PopulationEstimate2018 PopTotalMale2017 PopulationEstimate_above65_2017
## 1 478110 232553 80116
## 2 478110 232553 80116
## 3 478110 232553 80116
## 4 478110 232553 80116
## 5 478110 232553 80116
## 6 478110 232553 80116
## 7 682018 338751 94920
## 8 682018 338751 94920
## 9 682018 338751 94920
## 10 682018 338751 94920
## PopulationDensityperSqMile2010 DiabetesPercentage Smokers_Percentage
## 1 52.05 10.060 16.48
## 2 52.05 10.060 16.48
## 3 52.05 10.060 16.48
## 4 52.05 10.060 16.48
## 5 52.05 10.060 16.48
## 6 52.05 10.060 16.48
## 7 663.26 8.505 12.09
## 8 663.26 8.505 12.09
## 9 663.26 8.505 12.09
## 10 663.26 8.505 12.09
## HeartDiseaseMortality StrokeMortality Hospitals ICU_beds
## 1 148.8 30.90 5.067 103.9
## 2 148.8 30.90 5.067 103.9
## 3 148.8 30.90 5.067 103.9
## 4 148.8 30.90 5.067 103.9
## 5 148.8 30.90 5.067 103.9
## 6 148.8 30.90 5.067 103.9
## 7 153.9 37.89 5.672 126.5
## 8 153.9 37.89 5.672 126.5
## 9 153.9 37.89 5.672 126.5
## 10 153.9 37.89 5.672 126.5
## HospParticipatinginNetwork2017 stay_at_home above_50_gatherings
## 1 1.800 12 2
## 2 1.800 12 2
## 3 1.800 12 2
## 4 1.800 12 2
## 5 1.800 12 2
## 6 1.800 12 2
## 7 1.845 0 4
## 8 1.845 0 4
## 9 1.845 0 4
## 10 1.845 0 4
## above_500_gatherings restaurant_dine_in entertainment_gym days
## 1 6 7 7 14
## 2 6 7 7 28
## 3 6 7 7 42
## 4 6 7 7 56
## 5 6 7 7 70
## 6 6 7 7 84
## 7 8 3 3 14
## 8 8 3 3 28
## 9 8 3 3 42
## 10 8 3 3 56
pairs() Plotset.seed(42)
num_obs = nrow(data_unamed) # total number of observations
num_trn = round(num_obs * 0.90) # number of observations for the training data
trn_idx = sample(num_obs, num_trn) # randomly generate the index for the training data
data_trn = data_unamed[trn_idx, ] # training data
data_tst = data_unamed[-trn_idx, ] # testing data
k# full additive model
mod_k_full = lm(k ~ ., data = data_trn)
test_mod(mod_k_full, k = 1)
## Warning in predict.lm(model, newdata = data_tst): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(model, newdata = data_tst): prediction from a rank-
## deficient fit may be misleading
## loocv_rmse adj_r2 bp_pval.BP sw_pval num_params test_rmse perc_err
## 0.221852 0.475015 0.016221 0.000191 29.000000 0.149099 14.731106
summary(mod_k_full)
##
## Call:
## lm(formula = k ~ ., data = data_trn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4647 -0.0452 0.0073 0.0503 0.2077
##
## Coefficients: (9 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.62e+01 3.93e+00 4.13 0.00019 ***
## sigma NA NA NA NA
## lamda -6.46e+00 1.20e+00 -5.41 3.5e-06 ***
## c -3.05e-01 7.17e-02 -4.26 0.00012 ***
## alpha 3.08e+00 1.02e+00 3.00 0.00465 **
## omega 2.32e+01 2.59e+01 0.89 0.37630
## miu 4.41e+00 1.99e+00 2.21 0.03272 *
## cases -4.57e-07 1.64e-06 -0.28 0.78232
## deaths 1.32e-05 1.91e-05 0.69 0.49356
## recovered -1.54e-06 6.60e-06 -0.23 0.81645
## POP_LATITUDE -2.01e-01 5.03e-02 -3.99 0.00028 ***
## POP_LONGITUDE 6.94e-02 1.69e-02 4.11 0.00019 ***
## PopulationEstimate2018 -1.13e-04 2.90e-05 -3.91 0.00036 ***
## PopTotalMale2017 2.15e-04 5.58e-05 3.85 0.00043 ***
## PopulationEstimate_above65_2017 7.20e-05 1.76e-05 4.10 0.00020 ***
## PopulationDensityperSqMile2010 2.45e-05 1.41e-04 0.17 0.86330
## DiabetesPercentage -8.36e-01 2.00e-01 -4.19 0.00015 ***
## Smokers_Percentage 3.32e-01 8.49e-02 3.91 0.00036 ***
## HeartDiseaseMortality -2.90e-02 9.27e-03 -3.13 0.00331 **
## StrokeMortality 1.61e-01 4.45e-02 3.62 0.00083 ***
## Hospitals NA NA NA NA
## ICU_beds NA NA NA NA
## HospParticipatinginNetwork2017 NA NA NA NA
## stay_at_home NA NA NA NA
## above_50_gatherings NA NA NA NA
## above_500_gatherings NA NA NA NA
## restaurant_dine_in NA NA NA NA
## entertainment_gym NA NA NA NA
## days 3.16e-03 1.46e-03 2.17 0.03613 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.136 on 39 degrees of freedom
## Multiple R-squared: 0.647, Adjusted R-squared: 0.475
## F-statistic: 3.76 on 19 and 39 DF, p-value: 0.000227
# small additive model
mod_k_1 = lm(k ~ days, data = data_trn)
# large additive model
mod_k_2 = lm(k ~ . - Hospitals - ICU_beds - HospParticipatinginNetwork2017 - stay_at_home - above_50_gatherings - above_500_gatherings - restaurant_dine_in - entertainment_gym - sigma, data = data_trn)
test_mod(mod_k_1, k = 1)
## loocv_rmse adj_r2 bp_pval.BP sw_pval num_params test_rmse perc_err
## 1.867e-01 2.328e-02 9.083e-01 4.534e-13 2.000e+00 5.830e-02 5.580e+00
test_mod(mod_k_2, k = 1)
## loocv_rmse adj_r2 bp_pval.BP sw_pval num_params test_rmse perc_err
## 0.221852 0.475015 0.016221 0.000191 20.000000 0.149099 14.731106
summary(mod_k_1)
##
## Call:
## lm(formula = k ~ days, data = data_trn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2560 -0.0326 0.0395 0.0661 0.2381
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.805555 0.054607 14.75 <2e-16 ***
## days 0.001488 0.000964 1.54 0.13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.186 on 57 degrees of freedom
## Multiple R-squared: 0.0401, Adjusted R-squared: 0.0233
## F-statistic: 2.38 on 1 and 57 DF, p-value: 0.128
summary(mod_k_2)
##
## Call:
## lm(formula = k ~ . - Hospitals - ICU_beds - HospParticipatinginNetwork2017 -
## stay_at_home - above_50_gatherings - above_500_gatherings -
## restaurant_dine_in - entertainment_gym - sigma, data = data_trn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4647 -0.0452 0.0073 0.0503 0.2077
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.62e+01 3.93e+00 4.13 0.00019 ***
## lamda -6.46e+00 1.20e+00 -5.41 3.5e-06 ***
## c -3.05e-01 7.17e-02 -4.26 0.00012 ***
## alpha 3.08e+00 1.02e+00 3.00 0.00465 **
## omega 2.32e+01 2.59e+01 0.89 0.37630
## miu 4.41e+00 1.99e+00 2.21 0.03272 *
## cases -4.57e-07 1.64e-06 -0.28 0.78232
## deaths 1.32e-05 1.91e-05 0.69 0.49356
## recovered -1.54e-06 6.60e-06 -0.23 0.81645
## POP_LATITUDE -2.01e-01 5.03e-02 -3.99 0.00028 ***
## POP_LONGITUDE 6.94e-02 1.69e-02 4.11 0.00019 ***
## PopulationEstimate2018 -1.13e-04 2.90e-05 -3.91 0.00036 ***
## PopTotalMale2017 2.15e-04 5.58e-05 3.85 0.00043 ***
## PopulationEstimate_above65_2017 7.20e-05 1.76e-05 4.10 0.00020 ***
## PopulationDensityperSqMile2010 2.45e-05 1.41e-04 0.17 0.86330
## DiabetesPercentage -8.36e-01 2.00e-01 -4.19 0.00015 ***
## Smokers_Percentage 3.32e-01 8.49e-02 3.91 0.00036 ***
## HeartDiseaseMortality -2.90e-02 9.27e-03 -3.13 0.00331 **
## StrokeMortality 1.61e-01 4.45e-02 3.62 0.00083 ***
## days 3.16e-03 1.46e-03 2.17 0.03613 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.136 on 39 degrees of freedom
## Multiple R-squared: 0.647, Adjusted R-squared: 0.475
## F-statistic: 3.76 on 19 and 39 DF, p-value: 0.000227
# intermediate model
mod_k_3 = lm(k ~ . - Hospitals - ICU_beds - HospParticipatinginNetwork2017 - stay_at_home - above_50_gatherings - above_500_gatherings - restaurant_dine_in - entertainment_gym - sigma
- cases - deaths - omega - recovered - PopulationDensityperSqMile2010
+ log(days) - days
+ stay_at_home - PopulationEstimate2018 - PopTotalMale2017
+ I(POP_LATITUDE ^ 2) + + I(POP_LONGITUDE ^ 2),
data = data_trn)
mod_k_4 = lm(k ~ . - Hospitals - ICU_beds - HospParticipatinginNetwork2017 - stay_at_home - above_50_gatherings - above_500_gatherings - restaurant_dine_in - entertainment_gym - sigma
- cases - deaths - omega - recovered - PopulationDensityperSqMile2010 +
log(days) - days,
data = data_trn)
test_mod(mod_k_1, k = 1)
## loocv_rmse adj_r2 bp_pval.BP sw_pval num_params test_rmse perc_err
## 1.867e-01 2.328e-02 9.083e-01 4.534e-13 2.000e+00 5.830e-02 5.580e+00
test_mod(mod_k_2, k = 1)
## loocv_rmse adj_r2 bp_pval.BP sw_pval num_params test_rmse perc_err
## 0.221852 0.475015 0.016221 0.000191 20.000000 0.149099 14.731106
test_mod(mod_k_3, k = 1)
## loocv_rmse adj_r2 bp_pval.BP sw_pval num_params test_rmse perc_err
## 0.2054978 0.5247754 0.0017914 0.0001737 16.0000000 0.1552475 14.6026990
test_mod(mod_k_4, k = 1)
## loocv_rmse adj_r2 bp_pval.BP sw_pval num_params test_rmse perc_err
## 0.2042750 0.5311550 0.0009837 0.0004907 15.0000000 0.1546657 14.4151641
summary(mod_k_3)
##
## Call:
## lm(formula = k ~ . - Hospitals - ICU_beds - HospParticipatinginNetwork2017 -
## stay_at_home - above_50_gatherings - above_500_gatherings -
## restaurant_dine_in - entertainment_gym - sigma - cases -
## deaths - omega - recovered - PopulationDensityperSqMile2010 +
## log(days) - days + stay_at_home - PopulationEstimate2018 -
## PopTotalMale2017 + I(POP_LATITUDE^2) + +I(POP_LONGITUDE^2),
## data = data_trn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4742 -0.0548 0.0072 0.0526 0.2431
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.13e+03 4.10e+02 2.75 0.0087 **
## lamda -6.79e+00 1.13e+00 -6.02 3.4e-07 ***
## c -3.12e-01 6.69e-02 -4.67 2.9e-05 ***
## alpha 3.83e+00 8.45e-01 4.54 4.5e-05 ***
## miu 4.58e+00 1.71e+00 2.69 0.0102 *
## POP_LATITUDE -2.11e+01 7.52e+00 -2.80 0.0076 **
## POP_LONGITUDE 1.51e+01 5.58e+00 2.70 0.0098 **
## PopulationEstimate_above65_2017 -1.64e-04 6.58e-05 -2.50 0.0163 *
## DiabetesPercentage -8.55e+00 3.03e+00 -2.82 0.0072 **
## Smokers_Percentage 4.53e+00 1.62e+00 2.80 0.0077 **
## HeartDiseaseMortality -8.73e-02 2.73e-02 -3.20 0.0026 **
## StrokeMortality 4.52e-01 1.34e-01 3.38 0.0016 **
## log(days) 1.02e-01 2.99e-02 3.41 0.0014 **
## stay_at_home 1.14e+00 4.42e-01 2.59 0.0129 *
## I(POP_LATITUDE^2) 2.65e-01 9.49e-02 2.79 0.0078 **
## I(POP_LONGITUDE^2) 7.85e-02 2.91e-02 2.70 0.0100 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.13 on 43 degrees of freedom
## Multiple R-squared: 0.648, Adjusted R-squared: 0.525
## F-statistic: 5.27 on 15 and 43 DF, p-value: 8.7e-06
summary(mod_k_4)
##
## Call:
## lm(formula = k ~ . - Hospitals - ICU_beds - HospParticipatinginNetwork2017 -
## stay_at_home - above_50_gatherings - above_500_gatherings -
## restaurant_dine_in - entertainment_gym - sigma - cases -
## deaths - omega - recovered - PopulationDensityperSqMile2010 +
## log(days) - days, data = data_trn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4732 -0.0497 -0.0017 0.0607 0.2366
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.3989580 3.5627641 4.32 8.7e-05 ***
## lamda -6.8317626 1.1179898 -6.11 2.3e-07 ***
## c -0.3221741 0.0646726 -4.98 1.0e-05 ***
## alpha 3.6975440 0.8116795 4.56 4.1e-05 ***
## miu 4.9379779 1.6006925 3.08 0.00351 **
## POP_LATITUDE -0.1920080 0.0440854 -4.36 7.8e-05 ***
## POP_LONGITUDE 0.0662907 0.0153167 4.33 8.5e-05 ***
## PopulationEstimate2018 -0.0001105 0.0000270 -4.09 0.00018 ***
## PopTotalMale2017 0.0002107 0.0000518 4.06 0.00020 ***
## PopulationEstimate_above65_2017 0.0000669 0.0000154 4.35 8.1e-05 ***
## DiabetesPercentage -0.7787347 0.1646073 -4.73 2.3e-05 ***
## Smokers_Percentage 0.3062246 0.0605033 5.06 7.9e-06 ***
## HeartDiseaseMortality -0.0257963 0.0058214 -4.43 6.1e-05 ***
## StrokeMortality 0.1439159 0.0288281 4.99 9.9e-06 ***
## log(days) 0.0993978 0.0294363 3.38 0.00154 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.129 on 44 degrees of freedom
## Multiple R-squared: 0.644, Adjusted R-squared: 0.531
## F-statistic: 5.69 on 14 and 44 DF, p-value: 4.16e-06
# intermediate model
# relatively bad models
mod_k_5 = lm(k ~ . - Hospitals - ICU_beds - HospParticipatinginNetwork2017 - stay_at_home - above_50_gatherings - above_500_gatherings - restaurant_dine_in - entertainment_gym - sigma
- (lamda + c + alpha + omega + miu),
data = data_trn)
mod_k_6 = lm(k ~ lamda + c + alpha + omega + miu,
data = data_trn)
test_mod(mod_k_1, k = 1)
## loocv_rmse adj_r2 bp_pval.BP sw_pval num_params test_rmse perc_err
## 1.867e-01 2.328e-02 9.083e-01 4.534e-13 2.000e+00 5.830e-02 5.580e+00
test_mod(mod_k_2, k = 1)
## loocv_rmse adj_r2 bp_pval.BP sw_pval num_params test_rmse perc_err
## 0.221852 0.475015 0.016221 0.000191 20.000000 0.149099 14.731106
test_mod(mod_k_3, k = 1)
## loocv_rmse adj_r2 bp_pval.BP sw_pval num_params test_rmse perc_err
## 0.2054978 0.5247754 0.0017914 0.0001737 16.0000000 0.1552475 14.6026990
test_mod(mod_k_4, k = 1)
## loocv_rmse adj_r2 bp_pval.BP sw_pval num_params test_rmse perc_err
## 0.2042750 0.5311550 0.0009837 0.0004907 15.0000000 0.1546657 14.4151641
test_mod(mod_k_5, k = 1)
## loocv_rmse adj_r2 bp_pval.BP sw_pval num_params test_rmse perc_err
## 2.004e-01 1.101e-01 2.567e-01 2.605e-12 1.500e+01 9.543e-02 7.160e+00
test_mod(mod_k_6, k = 1)
## loocv_rmse adj_r2 bp_pval.BP sw_pval num_params test_rmse perc_err
## 2.135e-01 8.328e-02 1.932e-02 7.076e-10 6.000e+00 5.237e-02 5.250e+00
summary(mod_k_5)
##
## Call:
## lm(formula = k ~ . - Hospitals - ICU_beds - HospParticipatinginNetwork2017 -
## stay_at_home - above_50_gatherings - above_500_gatherings -
## restaurant_dine_in - entertainment_gym - sigma - (lamda +
## c + alpha + omega + miu), data = data_trn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9987 -0.0268 0.0054 0.0370 0.2963
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.44e+00 4.21e+00 2.01 0.051 .
## cases 5.03e-07 1.92e-06 0.26 0.795
## deaths 1.45e-05 2.31e-05 0.63 0.533
## recovered -5.29e-06 7.18e-06 -0.74 0.465
## POP_LATITUDE -9.83e-02 5.46e-02 -1.80 0.079 .
## POP_LONGITUDE 3.81e-02 1.82e-02 2.10 0.042 *
## PopulationEstimate2018 -5.57e-05 2.94e-05 -1.90 0.064 .
## PopTotalMale2017 1.06e-04 5.64e-05 1.88 0.067 .
## PopulationEstimate_above65_2017 3.67e-05 2.00e-05 1.83 0.073 .
## PopulationDensityperSqMile2010 -2.84e-05 1.52e-04 -0.19 0.852
## DiabetesPercentage -4.58e-01 2.28e-01 -2.01 0.051 .
## Smokers_Percentage 1.93e-01 9.88e-02 1.96 0.057 .
## HeartDiseaseMortality -1.62e-02 1.09e-02 -1.48 0.147
## StrokeMortality 9.04e-02 5.25e-02 1.72 0.092 .
## days 2.01e-03 1.60e-03 1.25 0.216
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.177 on 44 degrees of freedom
## Multiple R-squared: 0.325, Adjusted R-squared: 0.11
## F-statistic: 1.51 on 14 and 44 DF, p-value: 0.147
summary(mod_k_6)
##
## Call:
## lm(formula = k ~ lamda + c + alpha + omega + miu, data = data_trn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0289 -0.0445 0.0356 0.0886 0.2567
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.1002 0.0791 13.90 <2e-16 ***
## lamda -2.7175 1.2218 -2.22 0.030 *
## c -0.1608 0.0772 -2.08 0.042 *
## alpha 2.0142 1.1260 1.79 0.079 .
## omega -36.1277 24.4670 -1.48 0.146
## miu 0.8043 1.0704 0.75 0.456
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.18 on 53 degrees of freedom
## Multiple R-squared: 0.162, Adjusted R-squared: 0.0833
## F-statistic: 2.05 on 5 and 53 DF, p-value: 0.0859
mod_k = mod_k_5
diagnostics(mod_k, testit = FALSE)
sigma# mod_sigma =
lambda# mod_lambda =
c# mod_c =
alpha# mod_alpha =
omega# mod_omega =
miu# mod_miu =
wait
What we have achieved is a simple, interpretable, accessible model that anyone can get their hands on. It not only works towards assisting the prediction and prevention of pandemic, but also gives us clues on the relationship between variable social factors and the vulnerability of one place to the virus.
This is our very first step to make our own contribution to the control of the pandemic. We believe that everyone can take a step to do their parts, and we need not be freaked out by the technological obstacles and the already-done achievements by the “professionals”.
My partner and I are both undergraduates in non-CS majors, and this is our first time touching AWS or any other cloud service system. We started everything offline, accomplishing every tasks locally, and it did take us a while to figure out where to incorporate all those into AWS. But soon we see the enriched potentials and capability of AWS. Due to the limited time, we only took a slight bite of the AWS system, but we will be working to dig deeper.
The next step to take is to integrate everything into an efficient cloud computing system so that larger data and more sophisticated models could be handled, especially in a well-automated manner.
There are interesting stuffs like the political factor. We may wanna see how it affects We have learned a lot from this experience and we are looking forwards to becoming professional data analytist…blabla
get_bp_decision = function(model, alpha) {
decide = unname(bptest(model)$p.value < alpha)
ifelse(decide, "Reject", "Fail to Reject")
}
get_bp_pval = function(model) {
bptest(model)$p.value
}
get_sw_decision = function(model, alpha) {
decide = unname(shapiro.test(resid(model))$p.value < alpha)
ifelse(decide, "Reject", "Fail to Reject")
}
get_sw_pval = function(model) {
shapiro.test(resid(model))$p.value
}
get_num_params = function(model) {
length(coef(model))
}
get_loocv_rmse = function(model, is_log) {
ifelse(
is_log,
sqrt(mean(na.omit(((dat_trn$price - exp(fitted(model))) / (1 - hatvalues(model))) ^ 2))),
sqrt(mean((resid(model) / (1 - hatvalues(model))) ^ 2))
)
}
get_adj_r2 = function(model) {
summary(model)$adj.r.squared
}
test_mod = function(model, is_log = FALSE, k = 1){
c(loocv_rmse = get_loocv_rmse(model, is_log),
adj_r2 = get_adj_r2(model),
bp_pval = get_bp_pval(model),
sw_pval = get_sw_pval(model),
num_params = get_num_params(model),
test_rmse = get_test_rmse(model, k),
perc_err = get_perc_err(model, k))
}
diagnostics = function(model, pcol = "grey", lcol = "dodgerblue", alpha = 0.05, plotit = TRUE, testit = TRUE){
if (plotit){
par(mfrow = c(1, 2), pty="s")
plot(fitted(model), resid(model), col = "grey", pch = 20,
xlab = "Fitted", ylab = "Residual",
main = "Fitted versus Residuals")
abline(h = 0, col = "darkorange", lwd = 2)
qqnorm(resid(model), col = pcol)
qqline(resid(model), col = lcol, lwd = 2)
}
if (testit){
list(p_val = shapiro.test(resid(model))$p,
decision = ifelse(test = shapiro.test(resid(model))$p < alpha,
yes = "Reject", no = "Fail to Reject"))
}
}
get_test_rmse = function(model, k) {
sqrt(mean((data_tst[, k] - predict(model, newdata = data_tst))^ 2))
}
get_perc_err = function(model, k) {
actual = data_tst[, k]
predicted = predict(model, newdata = data_tst)
100 * mean((abs(actual - predicted)) / actual)
}